# ***** Anomaly Detection using Isolation Forest *****
import datetime
import pandas as pd
import requests
import matplotlib as mpl
import os
import plotly.express as px
import numpy as np
from sklearn.ensemble import IsolationForest
from fbprophet import Prophet
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio.plotly as py
import matplotlib.pyplot as plt
from matplotlib import pyplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import ppscore as pps
import seaborn as sns
import datasist as ds
from autoviz.AutoViz_Class import AutoViz_Class
from sklearn.preprocessing import OrdinalEncoder
from fancyimpute import KNN
import warnings
warnings.simplefilter('ignore')
# Consistent default figure size and no grid for all matplotlib plots.
mpl.rcParams['figure.figsize'] = (10, 8)
mpl.rcParams['axes.grid'] = False
print("Libraries imported successfully")  # fixed typo: "succesfully"
# Load the raw ads data and eyeball it before any cleaning.
df_ads = pd.read_csv('ads_challenge.csv')
display(df_ads)
# Feature dtypes before conversion.
df_ads.dtypes
# Parse the Date column into proper datetimes.
df_ads['Date'] = pd.to_datetime(df_ads['Date'])
df_ads.dtypes
# Dtypes plus non-null counts in one view.
df_ads.info()
# Percentage of missing values per column.
df_ads.isnull().sum() / len(df_ads) * 100
# Dataset dimensions.
df_ads.shape
# The numeric columns arrive as strings: impressions carry thousands
# separators, CTR / videos-completed are percent strings. Strip the
# punctuation, convert to float, and turn percentages into fractions.
for thousands_col in ('ad_type1_impressions', 'ad_type2_impressions'):
    df_ads[thousands_col] = df_ads[thousands_col].str.replace(',', '').astype('float')
for percent_col in ('ad_type1_CTR', 'ad_type2_CTR', 'ad_type2_videos_completed'):
    df_ads[percent_col] = df_ads[percent_col].str.replace('%', '').astype('float') * 10 ** (-2)
df_ads.head()
# Predictive Power Score (PPS) of every variable against the target ad_type2_CTR.
pps.predictors(df_ads, "ad_type2_CTR")
# Plot the PPS values as a bar chart (feature importance proxy).
plt.figure(figsize=(20,3))
df_predictors = pps.predictors(df_ads, y="ad_type2_CTR")  # NOTE(review): recomputes the same scores as the call above
display(sns.barplot(data=df_predictors, x="x", y="ppscore"))
# Keep only the features relevant for modelling type-2 CTR anomalies.
# Impressions and type-2 videos completed are retained despite their modest
# PPS importance; dropping them is a call best made with domain expertise.
# .copy() detaches the slice from df_ads so the in-place dropna below neither
# raises SettingWithCopyWarning nor silently no-ops under pandas copy-on-write.
ctr_video = df_ads[['Date', 'Country', 'ad_type2_impressions',
                    'ad_type2_videos_completed', 'ad_type2_CTR', 'ad_type1_CTR']].copy()
# Rows where type-2 CTR is null but videos-completed is not.
ctr_video[(ctr_video.ad_type2_CTR.isna()) & (~ctr_video.ad_type2_videos_completed.isna())]
# Rows where the country is not provided.
ctr_video[ctr_video.Country.isna()]
# Drop rows where Date, impressions, videos completed AND CTR are all null.
ctr_video.dropna(subset=['Date', 'ad_type2_impressions',
                         'ad_type2_videos_completed', 'ad_type2_CTR'],
                 inplace=True, how='all')
ctr_video[ctr_video.ad_type2_CTR.isna()]
# Use Date as the index for time-based slicing and plotting.
ctr_video = ctr_video.set_index('Date')
# Daily type-1 CTR trend for the United States, with a range slider and
# quick-select buttons. Fixed: the button counts now match their labels
# (the original used count=2 for '6m' and count=3 for '12m').
fig = px.line(df_ads[df_ads.Country == "United States"].reset_index(),
              x='Date', y=['ad_type1_CTR', 'ad_type1_impressions'],
              title='Type-1 CTR')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=12, label='12m', step='month', stepmode='todate'),
            dict(step='all'),
        ])
    ),
)
fig.show()
As you can see, the data is entirely missing from Jan 9 until Feb 14 for both impressions and CTR for ads group 1. There can be multiple reasons for this:
1) The data was not gathered, collected, or measured due to some technical issue.
2) There was no data — no ads were shown during that period.
To handle the above cases, I am making an assumption that, due to the start of COVID, no ads were shown during that period for group 1, and therefore we don't have data points available for that period of time. There are multiple ways to deal with it; for the sake of simplicity, for now I am replacing these values with 0.
# Replace missing type-1 CTR values with 0, per the assumption that no ads
# were served during the gap.
ctr_video['ad_type1_CTR'] = ctr_video['ad_type1_CTR'].fillna(0)
# Visualize the United States trend again after replacing nulls with 0.
# Fixed: range-selector button counts now match their labels (1m/6m/12m).
fig = px.line(ctr_video[ctr_video.Country == "United States"].reset_index(),
              x='Date', y=['ad_type1_CTR'], title='Type-1 CTR')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=12, label='12m', step='month', stepmode='todate'),
            dict(step='all'),
        ])
    ),
)
fig.show()
# Histograms of every numeric feature (datasist convenience wrapper).
ds.visualizations.histogram(ctr_video)
# Scatter plots of each numeric feature against the target type-2 CTR.
num_features = ctr_video.select_dtypes(include=['int64', 'float64']).columns
ds.visualizations.scatterplot(ctr_video,num_features=num_features,target='ad_type2_CTR')
# Automated EDA report via AutoViz.
ds.visualizations.autoviz(ctr_video)
# Split columns by dtype: numeric ones can go straight to the KNN imputer,
# while categorical ones must be ordinal-encoded first (KNN needs numbers).
num_features = ctr_video.select_dtypes(include=['int64', 'float64']).columns
cat_features = ctr_video.select_dtypes(include=['object', 'category']).columns

# One OrdinalEncoder per categorical column, kept in a dict so the encoding
# can be inverted after imputation.
ordinal_enc_dict = {}
for col_name in cat_features:
    encoder = OrdinalEncoder()
    ordinal_enc_dict[col_name] = encoder
    column = ctr_video[col_name]
    not_null_mask = column.notnull()
    # Fit/transform only the observed values, reshaped to the (n, 1)
    # 2-D layout sklearn expects.
    encoded = encoder.fit_transform(column[not_null_mask].values.reshape(-1, 1))
    # Write the codes back onto the non-null positions only.
    ctr_video.loc[not_null_mask, col_name] = np.squeeze(encoded)
# KNN imputer (fancyimpute) for imputing the remaining missing values.
KNN_imputer = KNN()
# Impute the whole frame; the iloc[:, :] assignment keeps the original
# index and columns while replacing all values with the imputed matrix.
ctr_video.iloc[:, :] = KNN_imputer.fit_transform(ctr_video)
ctr_video.head()
# Map the ordinal codes back to the original category labels using the
# encoders saved during the forward pass.
for cat_col in cat_features:
    codes_2d = ctr_video[cat_col].values.reshape(-1, 1)
    ctr_video[cat_col] = ordinal_enc_dict[cat_col].inverse_transform(codes_2d)
ctr_video.head()
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Scale the numeric features to [0, 1] so the impression counts (large
# magnitudes) and the CTR fractions are comparable in plots and in
# distance-based models. (Dead commented-out scaling code removed.)
scaler = MinMaxScaler()
numeric_features = ['ad_type2_impressions', 'ad_type2_videos_completed',
                    'ad_type1_CTR', 'ad_type2_CTR']
ctr_video[numeric_features] = scaler.fit_transform(ctr_video[numeric_features])
ctr_video.head()
# United States: scaled type-2 impressions vs both CTR series.
# Fixed: range-selector button counts now match their labels (1m/6m/12m).
fig = px.line(ctr_video[ctr_video.Country == "United States"].reset_index(),
              x='Date', y=['ad_type2_impressions', 'ad_type2_CTR', 'ad_type1_CTR'],
              title='Impressions Vs Click Through Rate')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=12, label='12m', step='month', stepmode='todate'),
            dict(step='all'),
        ])
    ),
)
fig.show()
# United States: scaled type-2 impressions vs type-2 CTR only.
# Fixed: range-selector button counts now match their labels (1m/6m/12m).
fig = px.line(ctr_video[ctr_video.Country == "United States"].reset_index(),
              x='Date', y=['ad_type2_impressions', 'ad_type2_CTR'],
              title='Impressions Vs Click Through Rate')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=12, label='12m', step='month', stepmode='todate'),
            dict(step='all'),
        ])
    ),
)
fig.show()
# India: scaled type-2 CTR vs type-2 impressions.
# Fixed: range-selector button counts now match their labels (1m/6m/12m).
fig = px.line(ctr_video[ctr_video.Country == "India"].reset_index(),
              x='Date', y=['ad_type2_CTR', 'ad_type2_impressions'],
              title='Impressions Vs Click Through Rate')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=12, label='12m', step='month', stepmode='todate'),
            dict(step='all'),
        ])
    ),
)
fig.show()
# Quick matplotlib sanity plot of the scaled US type-2 metrics.
ctr_video.query("Country=='United States'")[['ad_type2_impressions','ad_type2_CTR','ad_type2_videos_completed']].plot()
# Peek at the modelling columns.
ctr_video[['Country', 'ad_type2_impressions', 'ad_type2_videos_completed',
'ad_type2_CTR','ad_type1_CTR']].head()
# Unique country names list
ctr_video['Country'].unique()
# Move Date back from the index into a regular column.
ctr_video = ctr_video.reset_index()
ctr_video.head()
# Group by country, EXCLUDING Namibia and Guam because each has fewer than
# 2 data points (too few to fit a per-country model).
country_vid_ctr = ctr_video[(ctr_video.Country != 'Namibia') & (ctr_video.Country != 'Guam')].reset_index().groupby('Country')
country_vid_ctr.head()
# Re-import the plotting stack (duplicates the top-of-file imports; kept so
# this section can also run standalone as a notebook cell).
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio.plotly as py
import matplotlib.pyplot as plt
from matplotlib import pyplot
import plotly.graph_objs as go
# Enable inline plotly rendering in the notebook.
init_notebook_mode(connected=True)
def plot_anomaly(df, metric_name):
    """Render an interactive anomaly report for one metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns produced by ``classify_anomalies``:
        load_date, actuals, percentage_change, score, anomaly, anomaly_class.
    metric_name : str
        Used as the figure title.

    Draws (via plotly ``iplot``) a conditionally formatted summary table,
    the actuals as a line trace, and red markers on the anomalous points.
    """
    df.load_date = pd.to_datetime(df['load_date'].astype(str), format="%Y-%m-%d")
    dates = df.load_date
    # Build a series that is NaN everywhere except at anomalies (class != 0)
    # so the marker trace draws only those points.
    bool_array = (abs(df['anomaly']) > 0)
    actuals = df["actuals"][-len(bool_array):]
    anomaly_points = bool_array * actuals
    anomaly_points[anomaly_points == 0] = np.nan
    # Row colours keyed by anomaly class: 0 = none, 1 = low, 2 = high.
    color_map = {0: "aliceblue", 1: "yellow", 2: "red"}
    # Table with Date, actuals, % change from the previous point, score, class.
    table = go.Table(
        domain=dict(x=[0, 1],
                    y=[0, 0.3]),
        columnwidth=[1, 2],
        header=dict(height=20,
                    values=[['<b>Date</b>'], ['<b>Actual Values </b>'], ['<b>% Change </b>'], ['<b>Score</b>'],
                            ['<b>anomaly_class</b>']],
                    font=dict(color=['rgb(45, 45, 45)'] * 5, size=14),
                    fill=dict(color='#d562be')),
        cells=dict(values=[df.round(3)[k].tolist() for k in ['load_date', 'actuals', 'percentage_change', 'score', 'anomaly_class']],
                   line=dict(color='#506784'),
                   align=['center'] * 5,
                   font=dict(color=['rgb(40, 40, 40)'] * 5, size=12),
                   suffix=[None] + [''] + [''] + ['%'] + [''],
                   height=27,
                   # Fixed: colour the rows from THIS df, not the global
                   # anomaly_df the original accidentally referenced.
                   fill=dict(color=[df['anomaly_class'].map(color_map)])))
    # Line trace for the actual values.
    Actuals = go.Scatter(name='Actuals',
                         x=dates,
                         y=df['actuals'],
                         xaxis='x1', yaxis='y1',
                         mode='lines',
                         marker=dict(size=12,
                                     line=dict(width=1),
                                     color="blue"))
    # Marker trace highlighting the anomaly points.
    anomalies_map = go.Scatter(name="Anomaly",
                               showlegend=True,
                               x=dates,
                               y=anomaly_points,
                               mode='markers',
                               xaxis='x1',
                               yaxis='y1',
                               marker=dict(color="red",
                                           size=11,
                                           line=dict(color="red",
                                                     width=2)))
    # Shared axis styling for both x1 and y1.
    axis = dict(showline=True,
                zeroline=False,
                showgrid=True,
                mirror=True,
                ticklen=4,
                gridcolor='#ffffff',
                tickfont=dict(size=10))
    layout = dict(width=1000,
                  height=865,
                  autosize=False,
                  title=metric_name,
                  margin=dict(t=75),
                  showlegend=True,
                  xaxis1=dict(axis, **dict(domain=[0, 1], anchor='y1', showticklabels=True)),
                  yaxis1=dict(axis, **dict(domain=[2 * 0.21 + 0.20, 1], anchor='x1', hoverformat='.2f')))
    fig = go.Figure(data=[table, anomalies_map, Actuals], layout=layout)
    iplot(fig)
    pyplot.show()
def classify_anomalies(df, metric_name):
    """Post-process IsolationForest output for plotting.

    Adds the percentage change between consecutive points and maps the raw
    labels (1 = inlier, -1 = outlier) to ordinal classes:
    0 = no anomaly, 1 = low anomaly, 2 = high anomaly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain load_date, actuals, anomaly and score columns.
    metric_name : str
        Stored in a new metric_name column.

    Returns
    -------
    pandas.DataFrame sorted by load_date descending with shift,
    percentage_change and anomaly_class columns added.
    """
    df['metric_name'] = metric_name
    # Newest observations first, so shift(-1) points at the previous day.
    df = df.sort_values(by='load_date', ascending=False)
    df['shift'] = df['actuals'].shift(-1)
    df['percentage_change'] = ((df['actuals'] - df['shift']) / df['actuals']) * 100
    # Map sklearn labels to classes: inliers (1) -> 0, outliers (-1) -> 2.
    # Fixed: use df.loc[mask, col] instead of chained indexing, which raises
    # SettingWithCopyWarning and silently no-ops under pandas copy-on-write.
    df.loc[df['anomaly'] == 1, 'anomaly'] = 0
    df.loc[df['anomaly'] == -1, 'anomaly'] = 2
    df['anomaly_class'] = df['anomaly']
    # Downgrade borderline outliers to "low": scores above the worst
    # high-anomaly score but still at or below the 25th percentile.
    max_anomaly_score = df['score'].loc[df['anomaly_class'] == 2].max()
    medium_percentile = df['score'].quantile(0.25)
    df.loc[(df['score'] > max_anomaly_score) & (df['score'] <= medium_percentile), 'anomaly_class'] = 1
    return df
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
# Per-country anomaly detection: fit an IsolationForest on the selected
# metrics, sanity-check the flagged points in PCA space, then score the CTR
# series alone, classify and plot its anomalies.
for country in country_vid_ctr.groups:
    ctr = ctr_video[ctr_video.Country == country].reset_index()
    to_model_columns=['ad_type2_impressions','ad_type2_videos_completed','ad_type1_CTR','ad_type2_CTR']
    # Isolation forest model: 10% expected contamination, fixed seed for
    # reproducibility.
    clf=IsolationForest(n_estimators=100, max_samples='auto', contamination=float(.1), \
                        max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
    # Fitting the data to the model
    clf.fit(ctr[to_model_columns])
    # Predicting anomalies: 1 = inlier, -1 = outlier.
    pred = clf.predict(ctr[to_model_columns])
    ctr['anomaly']=pred
    outliers=ctr.loc[ctr['anomaly']==-1]
    outlier_index=list(outliers.index)
    print(country)
    # Normalize the metrics and project to 3 PCA components so the flagged
    # anomalies can be visually sanity-checked in a 3-D scatter plot.
    pca = PCA(n_components=3)  # Reduce to k=3 dimensions
    scaler = StandardScaler()
    # normalize the metrics
    X = scaler.fit_transform(ctr[to_model_columns])
    X_reduce = pca.fit_transform(X)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_zlabel("x_composite_3")
    # Plot the compressed data points (inliers in green).
    ax.scatter(X_reduce[:, 0], X_reduce[:, 1], zs=X_reduce[:, 2], s=4, lw=1, label="inliers",c="green")
    # Mark the model-flagged outliers with red crosses.
    ax.scatter(X_reduce[outlier_index,0],X_reduce[outlier_index,1], X_reduce[outlier_index,2],
               lw=2, s=60, marker="x", c="red", label="outliers")
    ax.legend()
    plt.show()
    # Drop the helper column added by reset_index above.
    del ctr['index']
    # Refit the same estimator on a single column (positionally column 4 —
    # presumably ad_type2_CTR given the column layout above; TODO confirm).
    clf.fit(ctr.iloc[:, 4:5])
    pred = clf.predict(ctr.iloc[:, 4:5])
    anomaly_df = pd.DataFrame()
    anomaly_df['load_date'] = ctr['Date']
    # decision_function gives the anomaly score (lower = more anomalous).
    anomaly_df['score'] = clf.decision_function(ctr.iloc[:, 4:5])
    anomaly_df['actuals'] = ctr.iloc[:, 4:5]
    anomaly_df['anomaly'] = pred
    # Indexes of outliers, kept for comparison with business events if needed.
    outliers = anomaly_df.loc[anomaly_df['anomaly'] == -1]
    outlier_index = list(outliers.index)
    anomaly_df = classify_anomalies(anomaly_df, country)
    # Plotting Anomaly
    plot_anomaly(anomaly_df, country)